1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.smartcrawler.extractor;
28 import java.io.File;
29 import java.io.FileInputStream;
30 import java.io.InputStream;
31 import java.net.URL;
32 import java.util.ArrayList;
33 import java.util.List;
34 import org.apache.commons.digester.Digester;
35 import org.apache.log4j.Logger;
36 import org.smartcrawler.common.SCLogger;
37 import org.smartcrawler.extractor.pattern.AbstractPattern;
38 import org.smartcrawler.extractor.pattern.ConcretePattern;
39
40 /***
41 *
42 *
43 * @author <a href="mailto:pozzad@alice.it">Davide Pozza</a>
44 * @version <tt>$Revision: 1.6 $</tt>
45 */
46 public class PatternProvider {
47
48 private static PatternProvider paProv = null;
49 private AbstractPattern[] patterns = null;
50 private static Logger log = SCLogger.getLogger(PatternProvider.class);
51
52 /*** Creates a new instance of PatternProvider */
53 private PatternProvider() {
54 init();
55 }
56
57 /***
58 *
59 * @return
60 */
61 public static synchronized PatternProvider instance() {
62 if (paProv == null) {
63 paProv = new PatternProvider();
64 }
65 return paProv;
66 }
67
68 private void init() {
69 InputStream input = null;
70 try {
71 String path = System.getProperty("extractionPatterns.file.path");
72 File customPatternsXml = null;
73 if (path != null) {
74 customPatternsXml = new File(System.getProperty("extractionPatterns.file.path"));
75 }
76 if (customPatternsXml != null && customPatternsXml.exists() && customPatternsXml.isFile()) {
77
78 input = new FileInputStream(customPatternsXml);
79 log.info("Loaded extraction patterns file: "
80 + customPatternsXml.getAbsolutePath());
81 } else {
82
83 URL source = getClass().getResource("/extractPatterns.xml");
84 input = source.openStream();
85 log.info("Loaded default extraction patterns file");
86 }
87 Digester digester = new Digester();
88
89 List list = new ArrayList();
90 digester.push(list);
91 digester.addObjectCreate("extractionPatterns/pattern",
92 ConcretePattern.class);
93 digester.addSetNext("extractionPatterns/pattern",
94 "add",
95 "java.lang.Object");
96 digester.addBeanPropertySetter("extractionPatterns/pattern/expression",
97 "stringPattern");
98 digester.addSetProperties( "extractionPatterns/pattern", "group", "group" );
99 digester.addSetProperties( "extractionPatterns/pattern", "tagName", "tagName" );
100
101
102 digester.parse( input );
103 patterns = (AbstractPattern[]) list.toArray(new AbstractPattern[list.size()]);
104 } catch (Exception e) {
105 log.fatal("Unable to load extraction patterns", e);
106 } finally {
107 try {
108 input.close();
109 }
110 catch (Exception e){}
111
112 }
113 }
114
115 /***
116 *
117 * @return
118 */
119 public AbstractPattern[] getPatterns() {
120 return patterns;
121
122 }
123 }